In this Analysis step, the Differential Expression Analysis (DEA) is
performed using the DESeq2 package.
After performing the DEA, the quality of the data is investigated with a look at the dispersion estimates, size factors and MA plots.
Goal is to find the most significant differential expressed genes. These will be visualized using Volcano plots. To compare the two tissues, the shared significant genes are plotted against each other and a heatmap with hierarchical clustering is performed on all significant genes of both tissues combined followed by a principle component analysis.
For the DESeq2 analysis, the raw, prefiltered counts will be used.
se.gastroc <- readRDS("./data/Robjects/01_se.gastroc.rds")
se.soleus <- readRDS("./data/Robjects/01_se.soleus.rds")
counts.gastroc <- se.gastroc[rowData(se.gastroc)$filtered,] %>% assay()
counts.soleus <- se.soleus[rowData(se.soleus)$filtered,] %>% assay()
metadata <- colData(se.gastroc)
The DDS objects are created for both tissues separately to investigate the differential expression in regard to the genotype (condition of KO or WT).
# changing to factor (needs DESeq)
metadata$genotype <- as.factor(metadata$genotype)
createDDSObject <- function(counts, metadata) {
# select sample columns
reorder_index <- match(rownames(metadata), colnames(counts))
counts <- counts[,reorder_index] %>%
apply(c(1,2), as.integer) # DESeq need normal counts (integer) as input
# Check metadata consistency
all(rownames(metadata) %in% colnames(counts)) %>%
assertthat::assert_that(., msg = "metadata and count table do not match")
## DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design = ~ genotype)
return( DESeq(dds) )
}
# creating DESeq Objects
dds.gastroc <- createDDSObject(counts.gastroc, metadata)
dds.soleus <- createDDSObject(counts.soleus, metadata)
Performing the Differential Expression Analysis and gathering the results from the DDS objects.
res.gastroc <- results(dds.gastroc, alpha = params$pCutoff, contrast = c("genotype", "KO", "WT"))
res.soleus <- results(dds.soleus, alpha = params$pCutoff, contrast = c("genotype", "KO", "WT"))
Also applying lfcShrink for comparison in volcano plots. Besides that it’s application was not further used.
# lfcShrink is not applied by default
# library(ashr) # using instead of "apeglm", because https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators
resLFC.gastroc <- lfcShrink(dds.gastroc, contrast = c("genotype", "KO", "WT"), type = "ashr")
using 'ashr' for LFC shrinkage. If used in published research, please cite:
Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
https://doi.org/10.1093/biostatistics/kxw041
resLFC.soleus <- lfcShrink(dds.soleus, contrast = c("genotype", "KO", "WT"), type = "ashr")
using 'ashr' for LFC shrinkage. If used in published research, please cite:
Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
https://doi.org/10.1093/biostatistics/kxw041
Filtering outliers using the cooks distance is done automatically and
p-values are set to NA. Changing (lowering) the
minReplicatesForReplace will replace these flagged
values.
DESeq2 flags these outliers and removes the corresponding genes completely, if below 7 samples (above would replace the values)
# assuming all NA p-values are the ones filtered out using the cooks distance
print_outliers <- function(res, se){
# all genes with a `NA` value are exactly those 5 which were also detected by the cooks distance:
outlier_ids <- res %>%
as.data.frame() %>%
filter(is.na(pvalue)) %>%
rownames()
# a different approach would be finding the genes which have a cooks distance
# above the cutoff:
# cooksCutoff <- qf(.99, 2,10)
# outlier_ids <- which(mcols(dds)$maxCooks > cooksCutoff) %>% names()
# getting counts and metadata
outlier_metadata.df <- rowData(se) %>%
as.data.frame() %>%
select(gene_name, gene_biotype)
counts.df <- assay(se)
# merging df
outliers.df <-
merge(outlier_metadata.df, counts.df[outlier_ids,], by = 0)
return(outliers.df)
}
outliers_gastroc.df <- print_outliers(res.gastroc, se.gastroc) %>% tibble::column_to_rownames(var = "Row.names")
write.csv(outliers_gastroc.df, file = "./code/analysis/02_outliers_gastroc.csv")
outliers_gastroc.df %>% knitr::kable()
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
| gene_name | gene_biotype | WT_1 | WT_3 | WT_5 | WT_9 | WT_11 | WT_15 | KO_2 | KO_4 | KO_6 | KO_8 | KO_10 | KO_16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSMUSG00000027375 | Mal | protein_coding | 496 | 82 | 87 | 88 | 118 | 71 | 112 | 80 | 70 | 99 | 164 | 200 |
| ENSMUSG00000039579 | Grin3a | protein_coding | 4 | 1 | 5 | 3 | 0 | 3 | 12 | 5 | 11 | 2 | 9 | 142 |
| ENSMUSG00000041607 | Mbp | protein_coding | 3390 | 462 | 524 | 654 | 836 | 536 | 849 | 493 | 456 | 606 | 1305 | 1094 |
| ENSMUSG00000053198 | Prx | protein_coding | 1190 | 159 | 236 | 327 | 346 | 200 | 313 | 196 | 197 | 237 | 428 | 372 |
| ENSMUSG00000056569 | Mpz | protein_coding | 7949 | 1168 | 1229 | 1562 | 1657 | 996 | 1792 | 1181 | 877 | 1447 | 2681 | 2691 |
print_outliers(res.soleus, se.soleus) %>% knitr::kable()
| Row.names | gene_name | gene_biotype | WT_1 | WT_3 | WT_5 | WT_9 | WT_11 | WT_15 | KO_2 | KO_4 | KO_6 | KO_8 | KO_10 | KO_16 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSMUSG00000000290 | Itgb2 | protein_coding | 22 | 47 | 45 | 50 | 72 | 18 | 38 | 457 | 44 | 58 | 66 | 63 |
| ENSMUSG00000000682 | Cd52 | protein_coding | 8 | 34 | 13 | 18 | 19 | 24 | 10 | 204 | 23 | 15 | 21 | 22 |
| ENSMUSG00000000982 | Ccl3 | protein_coding | 0 | 0 | 0 | 3 | 1 | 0 | 1 | 25 | 0 | 0 | 3 | 0 |
| ENSMUSG00000001131 | Timp1 | protein_coding | 21 | 39 | 25 | 32 | 30 | 38 | 94 | 1080 | 85 | 33 | 81 | 68 |
| ENSMUSG00000002985 | Apoe | protein_coding | 762 | 1049 | 1260 | 1220 | 1418 | 1005 | 1417 | 10248 | 1592 | 1330 | 1474 | 1369 |
| ENSMUSG00000003484 | Cyp4f18 | protein_coding | 2 | 7 | 3 | 8 | 9 | 3 | 17 | 166 | 10 | 15 | 12 | 4 |
| ENSMUSG00000003882 | Il7r | protein_coding | 3 | 3 | 1 | 7 | 2 | 0 | 2 | 129 | 3 | 5 | 6 | 6 |
| ENSMUSG00000004707 | Ly9 | protein_coding | 0 | 0 | 4 | 5 | 8 | 0 | 4 | 122 | 5 | 2 | 0 | 1 |
| ENSMUSG00000005087 | Cd44 | protein_coding | 183 | 229 | 200 | 236 | 240 | 190 | 324 | 1750 | 335 | 262 | 414 | 265 |
| ENSMUSG00000006519 | Cyba | protein_coding | 135 | 138 | 144 | 132 | 124 | 165 | 177 | 732 | 128 | 166 | 214 | 129 |
| ENSMUSG00000015947 | Fcgr1 | protein_coding | 6 | 5 | 12 | 6 | 10 | 3 | 7 | 143 | 15 | 4 | 10 | 6 |
| ENSMUSG00000015950 | Ncf1 | protein_coding | 22 | 51 | 59 | 45 | 71 | 18 | 67 | 497 | 76 | 36 | 75 | 55 |
| ENSMUSG00000017716 | Birc5 | protein_coding | 7 | 21 | 7 | 8 | 15 | 16 | 16 | 227 | 18 | 19 | 18 | 21 |
| ENSMUSG00000018927 | Ccl6 | protein_coding | 83 | 164 | 172 | 135 | 175 | 95 | 206 | 1248 | 182 | 114 | 204 | 109 |
| ENSMUSG00000019876 | Pkib | protein_coding | 2 | 2 | 6 | 2 | 4 | 4 | 4 | 92 | 3 | 3 | 1 | 4 |
| ENSMUSG00000019987 | Arg1 | protein_coding | 5 | 6 | 2 | 13 | 6 | 1 | 8 | 108 | 6 | 17 | 10 | 9 |
| ENSMUSG00000020120 | Plek | protein_coding | 17 | 39 | 30 | 32 | 48 | 22 | 36 | 407 | 44 | 32 | 21 | 29 |
| ENSMUSG00000020865 | Abcc3 | protein_coding | 14 | 13 | 22 | 16 | 19 | 11 | 17 | 184 | 23 | 19 | 23 | 13 |
| ENSMUSG00000021091 | Serpina3n | protein_coding | 19 | 22 | 29 | 22 | 22 | 22 | 65 | 404 | 42 | 31 | 49 | 57 |
| ENSMUSG00000021175 | Cdca7l | protein_coding | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 34 | 2 | 2 | 0 | 2 |
| ENSMUSG00000021886 | Gpr65 | protein_coding | 0 | 2 | 5 | 4 | 5 | 5 | 5 | 56 | 5 | 4 | 6 | 3 |
| ENSMUSG00000021998 | Lcp1 | protein_coding | 95 | 87 | 97 | 126 | 124 | 73 | 134 | 1457 | 185 | 129 | 187 | 73 |
| ENSMUSG00000022534 | Mefv | protein_coding | 1 | 9 | 1 | 1 | 2 | 0 | 0 | 33 | 2 | 1 | 0 | 2 |
| ENSMUSG00000024300 | Myo1f | protein_coding | 19 | 34 | 19 | 37 | 26 | 22 | 25 | 349 | 42 | 32 | 23 | 25 |
| ENSMUSG00000024397 | Aif1 | protein_coding | 9 | 6 | 7 | 4 | 9 | 10 | 5 | 128 | 8 | 9 | 15 | 10 |
| ENSMUSG00000024529 | Lox | protein_coding | 74 | 190 | 110 | 93 | 154 | 108 | 201 | 820 | 161 | 129 | 151 | 152 |
| ENSMUSG00000024675 | Ms4a4c | protein_coding | 0 | 0 | 5 | 0 | 2 | 0 | 3 | 26 | 0 | 0 | 3 | 0 |
| ENSMUSG00000024677 | Ms4a6b | protein_coding | 13 | 32 | 27 | 24 | 42 | 16 | 21 | 179 | 23 | 18 | 23 | 19 |
| ENSMUSG00000024679 | Ms4a6d | protein_coding | 9 | 17 | 12 | 15 | 21 | 13 | 12 | 305 | 22 | 20 | 15 | 6 |
| ENSMUSG00000025044 | Msr1 | protein_coding | 10 | 7 | 12 | 15 | 22 | 2 | 2 | 358 | 23 | 14 | 15 | 17 |
| ENSMUSG00000025473 | Adam8 | protein_coding | 30 | 57 | 44 | 53 | 39 | 38 | 49 | 525 | 45 | 38 | 52 | 72 |
| ENSMUSG00000025877 | Hk3 | protein_coding | 9 | 10 | 7 | 3 | 9 | 2 | 11 | 165 | 9 | 16 | 10 | 10 |
| ENSMUSG00000026358 | Rgs1 | protein_coding | 3 | 4 | 0 | 0 | 1 | 0 | 0 | 88 | 0 | 2 | 0 | 1 |
| ENSMUSG00000029304 | Spp1 | protein_coding | 77 | 47 | 68 | 34 | 78 | 113 | 97 | 4879 | 96 | 61 | 63 | 62 |
| ENSMUSG00000029373 | Pf4 | protein_coding | 56 | 69 | 105 | 115 | 111 | 45 | 57 | 846 | 99 | 83 | 75 | 50 |
| ENSMUSG00000029915 | Clec5a | protein_coding | 8 | 10 | 1 | 7 | 4 | 8 | 7 | 86 | 11 | 6 | 10 | 6 |
| ENSMUSG00000030117 | Gdf3 | protein_coding | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 129 | 10 | 1 | 3 | 1 |
| ENSMUSG00000030144 | Clec4d | protein_coding | 0 | 0 | 0 | 2 | 1 | 4 | 3 | 110 | 4 | 0 | 5 | 3 |
| ENSMUSG00000030365 | Clec2i | protein_coding | 3 | 3 | 2 | 8 | 2 | 5 | 0 | 44 | 2 | 3 | 4 | 0 |
| ENSMUSG00000030579 | Tyrobp | protein_coding | 19 | 25 | 45 | 39 | 25 | 18 | 24 | 508 | 34 | 13 | 26 | 20 |
| ENSMUSG00000030786 | Itgam | protein_coding | 9 | 36 | 16 | 20 | 39 | 6 | 18 | 450 | 36 | 14 | 36 | 23 |
| ENSMUSG00000030789 | Itgax | protein_coding | 24 | 20 | 8 | 9 | 9 | 10 | 17 | 141 | 20 | 19 | 29 | 24 |
| ENSMUSG00000031443 | F7 | protein_coding | 1 | 9 | 4 | 3 | 1 | 2 | 16 | 180 | 4 | 3 | 13 | 7 |
| ENSMUSG00000031827 | Cotl1 | protein_coding | 76 | 128 | 118 | 132 | 133 | 101 | 134 | 1350 | 156 | 175 | 174 | 125 |
| ENSMUSG00000032122 | Slc37a2 | protein_coding | 26 | 44 | 36 | 27 | 20 | 29 | 25 | 244 | 35 | 52 | 36 | 22 |
| ENSMUSG00000032218 | Ccnb2 | protein_coding | 14 | 22 | 24 | 4 | 13 | 17 | 27 | 129 | 15 | 26 | 20 | 15 |
| ENSMUSG00000032322 | Pstpip1 | protein_coding | 8 | 29 | 19 | 15 | 26 | 28 | 30 | 186 | 30 | 36 | 34 | 29 |
| ENSMUSG00000032511 | Scn5a | protein_coding | 47 | 49 | 51 | 86 | 108 | 55 | 74 | 542 | 87 | 73 | 92 | 87 |
| ENSMUSG00000033220 | Rac2 | protein_coding | 17 | 16 | 23 | 27 | 13 | 15 | 23 | 186 | 14 | 20 | 33 | 26 |
| ENSMUSG00000033777 | Tlr13 | protein_coding | 3 | 6 | 2 | 10 | 11 | 9 | 17 | 217 | 23 | 12 | 8 | 5 |
| ENSMUSG00000034947 | Tmem106a | protein_coding | 45 | 41 | 84 | 76 | 80 | 53 | 64 | 391 | 63 | 86 | 97 | 50 |
| ENSMUSG00000035202 | Lars2 | protein_coding | 1830 | 4441 | 2177 | 1921 | 1785 | 1989 | 7700 | 1688 | 1395 | 1567 | 1838 | 1302 |
| ENSMUSG00000035373 | Ccl7 | protein_coding | 3 | 3 | 3 | 14 | 3 | 0 | 5 | 398 | 16 | 17 | 10 | 6 |
| ENSMUSG00000035385 | Ccl2 | protein_coding | 10 | 12 | 11 | 43 | 15 | 1 | 16 | 348 | 14 | 12 | 41 | 5 |
| ENSMUSG00000036887 | C1qa | protein_coding | 115 | 166 | 220 | 242 | 260 | 142 | 199 | 1303 | 256 | 195 | 226 | 180 |
| ENSMUSG00000037280 | Galnt6 | protein_coding | 4 | 4 | 4 | 2 | 3 | 3 | 4 | 108 | 7 | 1 | 6 | 3 |
| ENSMUSG00000037649 | H2-DMa | protein_coding | 28 | 39 | 31 | 55 | 20 | 24 | 59 | 227 | 37 | 36 | 47 | 49 |
| ENSMUSG00000038147 | Cd84 | protein_coding | 8 | 16 | 13 | 8 | 16 | 10 | 4 | 408 | 15 | 9 | 21 | 32 |
| ENSMUSG00000038642 | Ctss | protein_coding | 74 | 111 | 107 | 110 | 99 | 50 | 110 | 3593 | 202 | 87 | 143 | 94 |
| ENSMUSG00000040204 | Pclaf | protein_coding | 8 | 8 | 8 | 7 | 5 | 5 | 3 | 197 | 15 | 13 | 14 | 6 |
| ENSMUSG00000040552 | C3ar1 | protein_coding | 14 | 13 | 32 | 36 | 51 | 3 | 48 | 689 | 41 | 20 | 49 | 22 |
| ENSMUSG00000040747 | Cd53 | protein_coding | 21 | 24 | 32 | 51 | 38 | 15 | 39 | 373 | 52 | 30 | 38 | 38 |
| ENSMUSG00000040809 | Chil3 | protein_coding | 1 | 0 | 0 | 0 | 6 | 0 | 0 | 66 | 0 | 6 | 4 | 1 |
| ENSMUSG00000040907 | Atp1a3 | protein_coding | 5 | 7 | 3 | 4 | 5 | 6 | 3 | 148 | 8 | 3 | 13 | 12 |
| ENSMUSG00000041431 | Ccnb1 | protein_coding | 4 | 14 | 2 | 1 | 5 | 0 | 5 | 118 | 5 | 1 | 13 | 8 |
| ENSMUSG00000043091 | Tuba1c | protein_coding | 113 | 105 | 81 | 135 | 172 | 142 | 126 | 606 | 125 | 108 | 119 | 107 |
| ENSMUSG00000043157 | Arl11 | protein_coding | 4 | 4 | 5 | 3 | 11 | 2 | 8 | 114 | 8 | 1 | 3 | 4 |
| ENSMUSG00000044811 | Cd300c2 | protein_coding | 4 | 7 | 4 | 8 | 3 | 3 | 9 | 178 | 12 | 14 | 12 | 11 |
| ENSMUSG00000044827 | Tlr1 | protein_coding | 3 | 1 | 3 | 0 | 1 | 0 | 2 | 78 | 6 | 3 | 9 | 2 |
| ENSMUSG00000046805 | Mpeg1 | protein_coding | 29 | 58 | 47 | 47 | 57 | 35 | 59 | 2548 | 237 | 45 | 87 | 67 |
| ENSMUSG00000047798 | Cd300lf | protein_coding | 0 | 3 | 1 | 1 | 2 | 1 | 6 | 126 | 1 | 0 | 2 | 3 |
| ENSMUSG00000048779 | P2ry6 | protein_coding | 27 | 15 | 31 | 45 | 52 | 29 | 47 | 274 | 38 | 35 | 55 | 34 |
| ENSMUSG00000052142 | Rasal3 | protein_coding | 10 | 5 | 3 | 7 | 6 | 4 | 3 | 54 | 7 | 4 | 4 | 1 |
| ENSMUSG00000052336 | Cx3cr1 | protein_coding | 12 | 18 | 11 | 10 | 24 | 7 | 27 | 366 | 23 | 8 | 9 | 12 |
| ENSMUSG00000055850 | Rnf181 | protein_coding | 41 | 20 | 42 | 23 | 45 | 355 | 48 | 38 | 37 | 35 | 341 | 29 |
| ENSMUSG00000056069 | Fam105a | protein_coding | 29 | 28 | 28 | 22 | 29 | 26 | 30 | 270 | 37 | 30 | 25 | 31 |
| ENSMUSG00000063415 | Cyp26b1 | protein_coding | 248 | 1257 | 198 | 94 | 293 | 99 | 4027 | 59 | 111 | 54 | 37 | 191 |
| ENSMUSG00000068606 | Gm4841 | protein_coding | 3 | 0 | 161 | 1 | 1 | 2 | 0 | 45 | 8 | 265 | 37 | 14 |
| ENSMUSG00000069516 | Lyz2 | protein_coding | 408 | 806 | 793 | 912 | 972 | 506 | 672 | 12094 | 1278 | 687 | 879 | 561 |
| ENSMUSG00000070780 | Rbm47 | protein_coding | 3 | 7 | 5 | 2 | 6 | 1 | 4 | 89 | 7 | 5 | 10 | 5 |
| ENSMUSG00000073412 | Lst1 | protein_coding | 6 | 2 | 7 | 5 | 5 | 3 | 4 | 57 | 2 | 4 | 4 | 6 |
| ENSMUSG00000076258 | Gm23935 | miRNA | 2112 | 8702 | 2434 | 2297 | 1344 | 2648 | 16331 | 1687 | 1437 | 1455 | 2057 | 1453 |
| ENSMUSG00000076281 | Gm24270 | miRNA | 22 | 63 | 36 | 28 | 17 | 30 | 152 | 18 | 12 | 14 | 30 | 20 |
| ENSMUSG00000078122 | F630028O10Rik | antisense | 0 | 15 | 11 | 12 | 13 | 8 | 5 | 97 | 8 | 15 | 9 | 9 |
| ENSMUSG00000078880 | Gm14308 | protein_coding | 4 | 1 | 106 | 1 | 0 | 0 | 2 | 4 | 4 | 2 | 3 | 0 |
| ENSMUSG00000078899 | Gm4631 | protein_coding | 0 | 0 | 0 | 3 | 0 | 3 | 81 | 0 | 1 | 1 | 1 | 1 |
| ENSMUSG00000079227 | Ccr5 | protein_coding | 13 | 5 | 2 | 21 | 21 | 5 | 15 | 262 | 16 | 16 | 14 | 5 |
| ENSMUSG00000079419 | Ms4a6c | protein_coding | 3 | 12 | 11 | 8 | 11 | 7 | 9 | 85 | 9 | 6 | 5 | 5 |
| ENSMUSG00000095380 | Gm23952 | miRNA | 3 | 55 | 7 | 5 | 5 | 7 | 10 | 5 | 6 | 7 | 7 | 8 |
| ENSMUSG00000103349 | Gm36888 | lincRNA | 2 | 1 | 14 | 0 | 1 | 3 | 4 | 0 | 0 | 0 | 24 | 0 |
| ENSMUSG00000115230 | AC101945.2 | lincRNA | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 37 | 0 | 1 | 1 | 3 |
sizeFactors(dds.gastroc)
WT_1 WT_3 WT_5 WT_9 WT_11 WT_15 KO_2 KO_4
1.0098829 0.6517416 0.7363933 1.1208488 0.9796707 0.7135590 1.2740994 1.0606543
KO_6 KO_8 KO_10 KO_16
1.0397324 1.2073234 1.0876389 1.6003663
sizeFactors(dds.soleus)
WT_1 WT_3 WT_5 WT_9 WT_11 WT_15 KO_2 KO_4
0.8782302 0.9413594 0.9933758 0.9259800 1.0048036 0.8889040 1.1802624 1.1871833
KO_6 KO_8 KO_10 KO_16
0.9450776 1.1361948 1.1343119 0.9180831
plotDispEsts(dds.gastroc)
plotDispEsts(dds.soleus)
plotMA(res.gastroc, ylim = c(-6,6))
plotMA(res.soleus, ylim = c(-6,6))
topGenes.gastroc <- as.data.frame(res.gastroc) %>%
tibble::rownames_to_column("GeneID") %>%
top_n(100, wt = -padj) %>%
arrange(padj)
topGenes.soleus <- as.data.frame(res.soleus) %>%
tibble::rownames_to_column("GeneID") %>%
top_n(100, wt = -padj) %>%
arrange(padj)
knitr::kable(head(topGenes.gastroc), caption = "gastroc")
| GeneID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| ENSMUSG00000050335 | 4934.2949 | 6.267396 | 0.1664065 | 37.66316 | 0 | 0 |
| ENSMUSG00000081249 | 1378.5460 | 4.208222 | 0.1115955 | 37.70959 | 0 | 0 |
| ENSMUSG00000010751 | 1864.4937 | 5.610357 | 0.2014267 | 27.85309 | 0 | 0 |
| ENSMUSG00000032712 | 5136.5059 | 4.686418 | 0.1683204 | 27.84225 | 0 | 0 |
| ENSMUSG00000034855 | 554.3475 | 7.276611 | 0.2735037 | 26.60517 | 0 | 0 |
| ENSMUSG00000037613 | 960.7558 | 3.183338 | 0.1266510 | 25.13472 | 0 | 0 |
knitr::kable(head(topGenes.soleus), caption = "soleus")
| GeneID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| ENSMUSG00000038615 | 32047.185 | -2.5454285 | 0.0636829 | -39.97038 | 0 | 0 |
| ENSMUSG00000081249 | 1651.592 | 4.5759032 | 0.1292779 | 35.39587 | 0 | 0 |
| ENSMUSG00000039067 | 4355.557 | -1.1049116 | 0.0419786 | -26.32084 | 0 | 0 |
| ENSMUSG00000028932 | 3190.657 | -1.1102258 | 0.0558536 | -19.87741 | 0 | 0 |
| ENSMUSG00000006998 | 7402.617 | -0.9196943 | 0.0468631 | -19.62511 | 0 | 0 |
| ENSMUSG00000102869 | 6985.622 | -0.9181929 | 0.0479924 | -19.13203 | 0 | 0 |
# TODO: use ggplot for unified look of plots
hist(res.gastroc$pvalue, main = "gastroc")
hist(res.soleus$pvalue, main = "soleus")
volcanoPlot <- function(result, se, pCutoff = 0.01, FCutoff = 1, tissue = character()) {
gene_names <-
rowData(se)[rownames(result), c("gene_name"), drop = F]
results.df <- result %>%
as.data.frame() %>%
dplyr::arrange(padj)
# top 10 gene labels for respectively up and down regulation
labs.up <- results.df[results.df$log2FoldChange > FCutoff, ] %>%
rownames() %>% .[1:10] %>% gene_names[., c("gene_name")]
labs.down <- results.df[results.df$log2FoldChange < -FCutoff, ] %>%
rownames() %>% .[1:10] %>% gene_names[., c("gene_name")]
selectLab <- c(labs.up, labs.down, "Nfe2l1") %>% unique() # always including "Nfe2l1"
# custom colors:
keyvals <- ifelse(
result$log2FoldChange > FCutoff &
result$padj < pCutoff,
params$regulated_pal$upregulated,
ifelse(
result$log2FoldChange < -FCutoff &
result$padj < pCutoff,
params$regulated_pal$downregulated,
params$regulated_pal$insignificant
)
)
keyvals[is.na(keyvals)] <- params$regulated_pal$insignificant
names(keyvals)[keyvals == params$regulated_pal$upregulated] <- 'up regulated'
names(keyvals)[keyvals == params$regulated_pal$insignificant] <- 'nonsignificant'
names(keyvals)[keyvals == params$regulated_pal$downregulated] <- 'down regulated'
vlc.plt <- EnhancedVolcano(
result,
x = 'log2FoldChange',
y = 'padj',
title = 'KO vs WT: Nfe2l1 knockout',
subtitle = ifelse(isEmpty(tissue), "", paste0('tissue: ', tissue)),
caption = "",
ylab = expression(paste(-Log[10], p[adj])),
pCutoff = pCutoff,
FCcutoff = FCutoff,
legendPosition = 'right',
pointSize = 2,
colCustom = keyvals,
lab = gene_names$gene_name,
selectLab = selectLab,
labSize = 3,
boxedLabels = TRUE,
drawConnectors = TRUE,
max.overlaps = Inf
)
return(vlc.plt)
}
p <- volcanoPlot(res.gastroc, se.gastroc, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "gastrocnemius")
ggsave(filename = "plots/02_volcano_gastroc.svg", p)
Saving 7 x 7 in image
ggsave(filename = "plots/02_volcano_gastroc.png", p)
p
p <- volcanoPlot(res.soleus, se.soleus, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "soleus")
ggsave(filename = "plots/02_volcano_soleus.svg", p)
Saving 19.8 x 19.8 in image
ggsave(filename = "plots/02_volcano_soleus.png", p)
Saving 19.8 x 19.8 in image
p
p <- volcanoPlot(resLFC.gastroc, se.gastroc, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "gastrocnemius")
Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
ggsave(filename = "plots/02_volcano_gastroc_LFC.svg", p)
Saving 7 x 7 in image
ggsave(filename = "plots/02_volcano_gastroc_LFC.png", p)
p
p <- volcanoPlot(resLFC.soleus, se.soleus, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "soleus")
Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
ggsave(filename = "plots/02_volcano_soleus_LFC.svg", p)
Saving 19.8 x 19.8 in image
ggsave(filename = "plots/02_volcano_soleus_LFC.png", p)
Saving 19.8 x 19.8 in image
p
using the Wald-test stat from the DESeq2 result and
filtering on the set FCutoff=1 and
pCutoff=0.01 yields the following plot:
apply_cutoffs <- function(deseq.result, colname="stat", FCutoff, pCutoff) {
res.filtered <- deseq.result %>%
data.frame() %>%
filter(padj < pCutoff,
log2FoldChange > FCutoff | log2FoldChange < -FCutoff) %>%
dplyr::rename(!!colname := stat) %>%
dplyr::select(!!colname)
return(res.filtered)
}
gastroc_res.filtered <- apply_cutoffs(res.gastroc, colname="gastroc", params$FCutoff, params$pCutoff)
soleus_res.filtered <- apply_cutoffs(res.soleus, colname="soleus", params$FCutoff, params$pCutoff)
gene_names <- rowData(se.gastroc) %>% as.data.frame() %>%
dplyr::select(gene_name)
# combining Wald-Test data from both tissues and ordering in quadrants
res.combined <- merge(gastroc_res.filtered,
soleus_res.filtered,
by = 0) %>%
mutate(diff.exp = case_when(
gastroc < 0 & soleus < 0 ~ "both down",
gastroc > 0 & soleus > 0 ~ "both up",
gastroc < 0 & soleus > 0 ~ "gastrocnemius down,\n soleus up",
gastroc > 0 & soleus < 0 ~ "gastrocnemus up,\n soleus down",
TRUE ~ "different"
)) %>%
merge(gene_names, by.x="Row.names", by.y=0)
# removing all gene_names except the top_n_genes (sum of absolute Wald-test numbers)
top_n_genes <- 10
top_labels <- res.combined %>%
group_by(diff.exp) %>%
arrange(desc(abs(gastroc) + abs(soleus))) %>%
filter(row_number() %in% c(1:top_n_genes)) %>%
ungroup() %>%
.$gene_name
res.combined <- res.combined %>%
mutate(gene_name = ifelse(gene_name %in% top_labels, gene_name, ""))
# final plot
p <- ggplot(res.combined, aes(x = gastroc, y = soleus, label = gene_name)) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_point(aes(color = diff.exp)) +
# scale_color_manual(values = c("red", "chartreuse1", "bisque", "royalblue")) +
labs(x = "gastrocnemius", y = "soleus", color = "significantly\ndifferentially\nexpressed") +
ggrepel::geom_label_repel(max.overlaps = 15) +
ggtitle(label = "DEA: t-statistics (Wald test)") +
theme_bw() +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 13),
title = element_text(size = 13),
legend.text = element_text(size = 10)
)
ggsave(filename = "plots/02_dea_scatter.svg", p)
Saving 7 x 7 in image
ggsave(filename = "plots/02_dea_scatter.png", p)
p
p <- ggplot(res.combined, aes(x = diff.exp)) +
geom_bar(aes(fill = diff.exp)) +
theme_bw()
ggsave(filename = "plots/02_dea_scatter_barplot.svg", p)
Saving 7 x 7 in image
ggsave(filename = "plots/02_dea_scatter_barplot.png", p)
p
boxplot.genes <- function(counts.df, title ="", FCutoff = 1) {
ggplot(counts.df,
aes(
# x = as.factor(gene_name),
x = reorder(gene_name,ensembl),
y = normalized_counts,
fill = genotype
)) +
geom_dotplot(
binaxis = 'y',
stackdir = 'center',
dotsize = 0.3,
position = position_dodge(0.8),
fill = "black"
) +
geom_boxplot(outlier.size = 0.3) +
scale_y_log10() +
xlab("Genes") +
ylab("Normalized Counts") +
scale_fill_manual(values = params$condition_pal) +
ggtitle(title) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
face = setBold(counts.df$gene_name, c("Nfe2l1"))
))
}
prepare_counts <- function(gene_ids, dds, se, reorder_genes = F) {
counts.top <- counts(dds, normalized = T)[gene_ids, ]
metadata <- colData(se) %>% as.data.frame()
gene_names <-
rowData(se)[, c("gene_name"), drop = F] %>% as.data.frame()
# the prepared counts df for the plot
counts.plt <-
data.frame(counts.top) %>%
tibble::rownames_to_column(var = "ensembl") %>%
tidyr::gather(key = "samplename",
value = "normalized_counts", 2:13) %>%
merge(metadata, by.x = "samplename", by.y = 0) %>%
merge(gene_names, by.x = "ensembl", by.y = 0) %>%
mutate(genotype = factor(genotype)) %>%
mutate(genotype = relevel(genotype, "WT")) # "WT" needs to be displayed before "KO"
if (reorder_genes) {
counts.plt <- counts.plt %>%
mutate(
gene_name = forcats::fct_reorder(gene_name, normalized_counts, .desc = T),
ensembl = forcats::fct_reorder(ensembl, normalized_counts, .desc = T)
)
} else {
idx = match(counts.plt$ensembl, gene_ids)
counts.plt <- counts.plt[order(idx),] %>%
mutate(gene_name = factor(gene_name))
counts.plt$ensembl <-
factor(counts.plt$ensembl, levels = gene_ids, ordered = TRUE)
counts.plt$gene_name <- factor(counts.plt$gene_name)
}
return(counts.plt)
}
# helper function to set the font of a label on the axis to bold
setBold <- function(src, special_labs) {
# source: https://stackoverflow.com/questions/39694490/highlighting-individual-axis-labels-in-bold-using-ggplot2
if (!is.factor(src))
src <- factor(src)
src_levels <- base::levels(src)
brave <- special_labs %in% src_levels
b_vec <- rep("plain", length(src_levels))
if (all(brave)) {
b_pos <- purrr::map_int(special_labs, ~ which(. == src_levels))
b_vec[b_pos] <- "bold"
b_vec
} else {
message("setBold: no matching element found")
}
return(b_vec)
}
# actual plot call
for (group in unique(res.combined$diff.exp)) {
gene_ids <- filter(res.combined, diff.exp == group)$`Row.names`
prep_counts.gastroc <- prepare_counts(gene_ids, dds.gastroc, se.gastroc, reorder_genes = T)
prep_counts.soleus <-
prepare_counts(levels(prep_counts.gastroc$ensembl), dds.soleus, se.soleus, reorder_genes = T)
gene_ids <- levels(prep_counts.gastroc$ensembl)
p.g <- boxplot.genes(
counts.df = prep_counts.gastroc,
title = paste0("Sign. regulated genes: ", group, "\n tissue: ", "gastrocnemius")
) #%>% print()
p.s <- boxplot.genes(
counts.df = prep_counts.soleus,
title = paste0("\n tissue: ", "soleus")
) #%>% print()
ggpubr::ggarrange(p.g, p.s, ncol = 1, nrow = 2)
}
setBold <- function(src, special_labs) {
# source: https://stackoverflow.com/questions/39694490/highlighting-individual-axis-labels-in-bold-using-ggplot2
if (!is.factor(src))
src <- factor(src)
src_levels <- base::levels(src)
brave <- special_labs %in% src_levels
b_vec <- rep("plain", length(src_levels))
if (all(brave)) {
b_pos <- purrr::map_int(special_labs, ~ which(. == src_levels))
b_vec[b_pos] <- "bold"
b_vec
} else {
message("setBold: no matching element found")
}
return(b_vec)
}
getTopExpressedEnsemblNames <- function(result,
FCutoff,
n,
up = T) {
.filter <- ifelse(up, `>`, `<`)
subset(result, .filter(log2FoldChange, FCutoff)) %>%
data.frame() %>%
filter(baseMean > 100) %>%
arrange(padj) %>%
.[1:n, ] %>%
rownames()
}
boxplot.top <- function(result, dds, se, upregulated=TRUE, n = 20, FCutoff = 1, tissue ="") {
# getting the top n regulated genes
names.top <- getTopExpressedEnsemblNames(result, FCutoff=FCutoff, n=n, up = upregulated)
counts.top <- counts(dds, normalized=T)[names.top, ]
metadata <- colData(se) %>% as.data.frame()
gene_names <- rowData(se)[, c("gene_name"), drop = F] %>% as.data.frame()
counts.plt <-
data.frame(counts.top) %>%
tibble::rownames_to_column(var = "ensembl") %>%
tidyr::gather(key = "samplename",
value = "normalized_counts", 2:13) %>%
merge(metadata, by.x="samplename", by.y=0) %>%
merge(gene_names, by.x="ensembl", by.y=0) %>%
mutate(genotype = factor(genotype)) %>%
mutate(genotype = relevel(genotype, "WT")) %>% # "WT" needs to be displayed before "KO"
mutate(gene_name = forcats::fct_reorder(gene_name, normalized_counts, .desc = T))
direction <- ifelse(upregulated, "up", "down")
ggplot(counts.plt,
aes(
x = as.factor(gene_name),
y = normalized_counts,
fill = genotype
)) +
geom_dotplot(
binaxis = 'y',
stackdir = 'center',
dotsize = 0.3,
position = position_dodge(0.8),
fill = "black"
) +
geom_boxplot(outlier.size = 0.3) +
scale_y_log10() +
xlab("Genes") +
ylab("Normalized Counts") +
scale_fill_manual(values = params$condition_pal) +
ggtitle(paste0("Top ", n, " ", direction, "-regulated Genes\n tissue: ", tissue)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
face = setBold(counts.plt$gene_name, c("Nfe2l1"))
))
}
boxplot.top(res.gastroc, dds.gastroc, se.gastroc, upregulated = T, tissue = "gastrocnemius")
boxplot.top(res.gastroc, dds.gastroc, se.gastroc, upregulated = F, tissue = "gastrocnemius")
boxplot.top(res.soleus, dds.soleus, se.soleus, upregulated = T, tissue = "soleus")
boxplot.top(res.soleus, dds.soleus, se.soleus, upregulated = F, tissue = "soleus")
significant grouped:
venn.colors <- c(params$tissue_pal, scales::hue_pal()(4))
# numbers for euler diagram
gene_sets <- c(
"gastrocnemius" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
"soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
"gastrocnemius&soleus" = sign_gene_stats$shared_sig_genes
)
# euler: two tissues
p <- plot(
euler(gene_sets),
quantities = T,
legend = list(side = "right"),
fills = venn.colors,
main = "significant genes"
)
p
Replacing the raw counts with normalized counts for better visual representation of the performed DEA:
counts.gastroc <- counts(dds.gastroc, normalized = T)
counts.soleus <- counts(dds.soleus, normalized = T)
The genes previously determined significant
(tissue_res.filtered) in both tissues will be clustered via
hierachical clustering and a PCA will be performed by taking the union
of the genes.
# obtaining all DE genes, but avoiding duplicates
sign_genes <-
unique(c(
row.names(gastroc_res.filtered),
row.names(soleus_res.filtered)
))
# making sure, that DE genes occur in both tissues
sign_genes <-
sign_genes[sign_genes %in% rownames(counts.gastroc) &
sign_genes %in% rownames(counts.soleus)]
counts_sign <- merge(
counts.gastroc[sign_genes,],
counts.soleus[sign_genes,],
by = 0,
suffixes = c("_gastrocnemius", "_soleus")
) %>%
tibble::column_to_rownames(var="Row.names") %>%
as.matrix()
A zscore is applied to the counts to visualize the differential expression of the normalized counts in a heatmap.
zscore <- function(M) {
s <- apply( M,1,sd ) # standard deviation
µ <- apply( M, 1, mean ) # mean
M.z <- (M - µ) / s # zscore
return(M.z)
}
# using the function from "code/visualizations_functions.R"
chm <- plot_combined_CHM(zscore(counts_sign), params=params)
PCA is applied to the same significantly DE genes (normalized) as used in the heatmap.
# using the function from "code/visualizations_functions.R"
p <- plot_combined_pca(counts_sign, params=params)
p
save(dds.gastroc, dds.soleus, res.gastroc, res.soleus, file = "./data/Robjects/02_DDS.RData")